Goal: can machine learning methods help us to associate metabolites with leaf length?
Previously (script 11b2) I filtered out unnamed metabolites. Here I keep them all. but remove blank samples
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.0-2
library(relaimpo)
## Loading required package: MASS
## Loading required package: boot
## Loading required package: survey
## Loading required package: grid
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
##
## aml
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x tidyr::pack() masks Matrix::pack()
## x dplyr::select() masks MASS::select()
## x tidyr::unpack() masks Matrix::unpack()
get leaflength data
leaflength <- read_csv("../../plant/output/leaf_lengths_metabolite.csv") %>%
mutate(pot=str_pad(pot, width=3, pad="0"),
sampleID=str_c("wyo", genotype, pot, sep="_"),
group=str_c(soil,genotype,trt,sep="_")) %>%
select(sampleID, group, leaf_avg_std) %>%
filter(!str_detect(group, "BLANK"))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## pot = col_double(),
## soil = col_character(),
## genotype = col_character(),
## trt = col_character(),
## leaf_avg = col_double(),
## leaf_avg_std = col_double()
## )
length(unique(leaflength$group))
## [1] 4
leaflength %>% arrange(sampleID)
get and wrangle metabolite data
met_raw <-read_csv("../input/metabolites_set1.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## tissue = col_character(),
## soil = col_character(),
## genotype = col_character(),
## autoclave = col_character(),
## time_point = col_character(),
## concatenate = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
met <- met_raw %>%
mutate(pot=str_pad(pot, width = 3, pad = "0")) %>%
mutate(sampleID=str_c("wyo", genotype, pot, sep="_")) %>%
filter(soil!="BLANK") %>%
select(sampleID, genotype, tissue, sample_mass = `sample_mass mg`, !submission_number:concatenate) %>%
pivot_longer(!sampleID:sample_mass, names_to = "metabolite", values_to = "met_amount") %>%
#adjust by sample mass
mutate(met_per_mg=met_amount/sample_mass) %>%
#scale and center
group_by(metabolite, genotype, tissue) %>%
mutate(met_per_mg=scale(met_per_mg),
met_amt=scale(met_amount)
) %>%
pivot_wider(id_cols = sampleID,
names_from = c(tissue, metabolite),
values_from = starts_with("met_"),
names_sep = "_")
met
split this into two data frames, one normalized by tissue amount and one not.
met_per_mg <- met %>% select(sampleID, starts_with("met_per_mg")) %>%
as.data.frame() %>% column_to_rownames("sampleID")
met_amt <- met %>% select(sampleID, starts_with("met_amt")) %>%
as.data.frame() %>% column_to_rownames("sampleID")
get leaf data order to match
leaflength <- leaflength[match(met$sampleID, leaflength$sampleID),]
leaflength
met_per_mg.leaf_PCA <- met_per_mg %>%
select(matches("_leaf_")) %>%
prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.leaf_PCA)
## [1] "sdev" "rotation" "center" "scale" "x"
tibble(variance=met_per_mg.leaf_PCA$sdev^2, PC=str_c("PC",
str_pad(1:length(met_per_mg.leaf_PCA$sdev), width = 2, pad="0"))) %>%
mutate(percent_var=100*variance/sum(variance),
cumulative_var=cumsum(percent_var)) %>%
magrittr::extract(1:15,) %>%
ggplot(aes(x=PC, y=percent_var)) +
geom_col(fill="skyblue") +
geom_line(aes(y=cumulative_var), group="") +
ggtitle("percent variance explained, named, normalized leaf metabolites")
met_per_mg.root_PCA <- met_per_mg %>%
select(matches("_root_")) %>%
prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.root_PCA)
## [1] "sdev" "rotation" "center" "scale" "x"
tibble(variance=met_per_mg.root_PCA$sdev^2, PC=str_c("PC",
str_pad(1:length(met_per_mg.root_PCA$sdev), width = 2, pad="0"))) %>%
mutate(percent_var=100*variance/sum(variance),
cumulative_var=cumsum(percent_var)) %>%
magrittr::extract(1:15,) %>%
ggplot(aes(x=PC, y=percent_var)) +
geom_col(fill="skyblue") +
geom_line(aes(y=cumulative_var), group="") +
ggtitle("percent variance explained, named, normalized root metabolites")
met_amt.leaf_PCA <- met_amt %>%
select(matches("_leaf_")) %>%
prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.leaf_PCA)
## [1] "sdev" "rotation" "center" "scale" "x"
tibble(variance=met_amt.leaf_PCA$sdev^2, PC=str_c("PC",
str_pad(1:length(met_amt.leaf_PCA$sdev), width = 2, pad="0"))) %>%
mutate(percent_var=100*variance/sum(variance),
cumulative_var=cumsum(percent_var)) %>%
magrittr::extract(1:15,) %>%
ggplot(aes(x=PC, y=percent_var)) +
geom_col(fill="skyblue") +
geom_line(aes(y=cumulative_var), group="") +
ggtitle("percent variance explained, named, raw leaf metabolites")
met_amt.root_PCA <- met_amt %>%
select(matches("_root_")) %>%
prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.root_PCA)
## [1] "sdev" "rotation" "center" "scale" "x"
tibble(variance=met_amt.root_PCA$sdev^2, PC=str_c("PC",
str_pad(1:length(met_amt.root_PCA$sdev), width = 2, pad="0"))) %>%
mutate(percent_var=100*variance/sum(variance),
cumulative_var=cumsum(percent_var)) %>%
magrittr::extract(1:15,) %>%
ggplot(aes(x=PC, y=percent_var)) +
geom_col(fill="skyblue") +
geom_line(aes(y=cumulative_var), group="") +
ggtitle("percent variance explained, named, raw root metabolites")
are the PCs normalized?
colMeans(met_amt.leaf_PCA$x) %>% round(3) #yes centered
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24
## 0 0 0 0 0 0 0 0
apply(met_amt.leaf_PCA$x, 2, sd) %>% round(2) #not scaled
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13
## 11.89 9.03 6.73 6.47 5.80 5.78 5.37 5.08 4.58 4.40 4.36 4.35 4.03
## PC14 PC15 PC16 PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24
## 3.83 3.76 3.64 3.55 3.46 3.30 3.19 3.14 3.06 0.00 0.00
combine the leaf and root, and then scale them:
met_per_mg.leaf_PCs <- met_per_mg.leaf_PCA$x
colnames(met_per_mg.leaf_PCs) <- str_c("leaf_", colnames(met_per_mg.leaf_PCs))
met_per_mg.root_PCs <- met_per_mg.root_PCA$x
colnames(met_per_mg.root_PCs) <- str_c("root_", colnames(met_per_mg.root_PCs))
met_per_mg.PCs <- cbind(met_per_mg.leaf_PCs, met_per_mg.root_PCs) %>%
scale()
met_amt.leaf_PCs <- met_amt.leaf_PCA$x
colnames(met_amt.leaf_PCs) <- str_c("leaf_", colnames(met_amt.leaf_PCs))
met_amt.root_PCs <- met_amt.root_PCA$x
colnames(met_amt.root_PCs) <- str_c("root_", colnames(met_amt.root_PCs))
met_amt.PCs <- cbind(met_amt.leaf_PCs, met_amt.root_PCs) %>%
scale()
also combine the rotations
met_per_mg.leaf_rotation <- met_per_mg.leaf_PCA$rotation %>%
as.data.frame() %>%
rename_with(~ str_c("leaf_", .x)) %>%
rownames_to_column("metabolite")
met_per_mg.root_rotation <- met_per_mg.root_PCA$rotation %>%
as.data.frame() %>%
rename_with(~ str_c("root_", .x)) %>%
rownames_to_column("metabolite")
met_per_mg.PC_rotation <- full_join(met_per_mg.leaf_rotation, met_per_mg.root_rotation, by="metabolite")
met_amt.leaf_rotation <- met_amt.leaf_PCA$rotation %>%
as.data.frame() %>%
rename_with(~ str_c("leaf_", .x)) %>%
rownames_to_column("metabolite")
met_amt.root_rotation <- met_amt.root_PCA$rotation %>%
as.data.frame() %>%
rename_with(~ str_c("root_", .x)) %>%
rownames_to_column("metabolite")
met_amt.PC_rotation <- full_join(met_amt.leaf_rotation, met_amt.root_rotation, by="metabolite")
met_per_mg_fit1LOO <- cv.glmnet(x=met_per_mg.PCs, y=leaflength$leaf_avg_std, nfolds = nrow(met_per_mg.PCs), alpha=1 )
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per
## fold
plot(met_per_mg_fit1LOO)
bestlam=met_per_mg_fit1LOO$lambda.1se
NEXT STEP: Do a K-fold CV, repeat many times and average. Might as well do alpha while we are at it. If we are doing alpha, then we need to manually create our own folds list for each run
Fit 101 CVs for each of 11 alphas
set.seed(1245)
folds <- tibble(run=1:101) %>%
mutate(folds=map(run, ~ sample(rep(1:4,6))))
system.time (met_per_mg_multiCV <- expand_grid(run=1:100, alpha=round(seq(0,1,.1),1)) %>%
left_join(folds, by="run") %>%
mutate(fit=map2(folds, alpha, ~ cv.glmnet(x=met_per_mg.PCs, y=leaflength$leaf_avg_std, foldid = .x, alpha=.y
)))
#, lambda=exp(seq(-5,0,length.out = 50)) )))
) #100 seconds
## user system elapsed
## 47.257 1.434 52.780
head(met_per_mg_multiCV)
for each fit, pull out the mean cv error, lambda, min lambda, and 1se lambda
met_per_mg_multiCV <- met_per_mg_multiCV %>%
mutate(cvm=map(fit, magrittr::extract("cvm")),
lambda=map(fit, magrittr::extract("lambda")),
lambda.min=map_dbl(fit, magrittr::extract("lambda.min" )),
lambda.1se=map_dbl(fit, magrittr::extract("lambda.1se")),
nzero=map(fit, magrittr::extract("nzero"))
)
head(met_per_mg_multiCV)
now calculate the mean and sem of cvm and min,1se labmdas. These need to be done separately because of the way the grouping works
met_per_mg_summary_cvm <- met_per_mg_multiCV %>% dplyr::select(-fit, -folds) %>%
unnest(c(cvm, lambda)) %>%
group_by(alpha, lambda) %>%
summarize(meancvm=mean(cvm), sem=sd(cvm)/sqrt(n()), high=meancvm+sem, low=meancvm-sem)
## `summarise()` regrouping output by 'alpha' (override with `.groups` argument)
met_per_mg_summary_cvm
met_per_mg_summary_lambda <- met_per_mg_multiCV %>% dplyr::select(-fit, -folds, -cvm) %>%
group_by(alpha) %>%
summarize(
lambda.min.sd=sd(lambda.min),
lambda.min.mean=mean(lambda.min),
#lambda.min.med=median(lambda.min),
lambda.min.high=lambda.min.mean+lambda.min.sd,
#lambda.min.low=lambda.min.mean-lambda.min.sem,
#lambda.1se.sem=sd(lambda.1se)/sqrt(n()),
lambda.1se.mean=mean(lambda.1se),
#lambda.1se.med=median(lambda.1se),
#lambda.1se.high=lambda.1se+lambda.1se.sem,
#lambda.1se.low=lambda.1se-lambda.1se.sem,
nzero=nzero[1],
lambda=lambda[1]
)
## `summarise()` ungrouping output (override with `.groups` argument)
met_per_mg_summary_lambda
plot it
met_per_mg_summary_cvm %>%
#filter(alpha!=0) %>% # worse than everything else and throwing the plots off
ggplot(aes(x=log(lambda), y= meancvm, ymin=low, ymax=high)) +
geom_ribbon(alpha=.25) +
geom_line(aes(color=as.character(alpha))) +
facet_wrap(~ as.character(alpha)) +
coord_cartesian(xlim=(c(-5,0))) +
geom_vline(aes(xintercept=log(lambda.min.mean)), alpha=.5, data=met_per_mg_summary_lambda) +
geom_vline(aes(xintercept=log(lambda.min.high)), alpha=.5, data=met_per_mg_summary_lambda, color="blue")
So overall these look more reasonable than the LOO plot.
Make a plot of MSE at minimum lambda for each alpha
met_per_mg_summary_cvm %>%
group_by(alpha) %>%
filter(rank(meancvm, ties.method = "first")==1) %>%
ggplot(aes(x=alpha,y=meancvm,ymin=low,ymax=high)) +
geom_ribbon(color=NA, fill="gray80") +
geom_line() +
geom_point()
not a particular large difference there, aside from 0.1 and even then, not too much better.
Plot the number of nzero coefficients
met_per_mg_summary_lambda %>%
unnest(c(lambda, nzero)) %>%
group_by(alpha) %>%
filter(abs(lambda.min.mean-lambda)==min(abs(lambda.min.mean-lambda)) ) %>%
ungroup() %>%
ggplot(aes(x=as.character(alpha), y=nzero)) +
geom_point() +
ggtitle("Number of non-zero coefficents at minimum lambda") +
ylim(0,24)
## Warning: Removed 1 rows containing missing values (geom_point).
OK let’s do repeated test train starting from these CV lambdas
multi_tt <- function(lambda, alpha, n=10000, sample_size=24, train_size=20, x, y=leaflength$leaf_avg_std) {
print(lambda)
print(alpha)
tt <-
tibble(run=1:n) %>%
mutate(train=map(run, ~ sample(1:sample_size, train_size))) %>%
mutate(fit=map(train, ~ glmnet(x=x[.,], y=y[.], lambda = lambda, alpha = alpha ))) %>%
mutate(pred=map2(fit, train, ~ predict(.x, newx = x[-.y,]))) %>%
mutate(cor=map2_dbl(pred, train, ~ cor(.x, y[-.y]) )) %>%
mutate(MSE=map2_dbl(pred, train, ~ mean((y[-.y] - .x)^2))) %>%
summarize(
num_na=sum(is.na(cor)),
num_lt_0=sum(cor<=0, na.rm=TRUE),
avg_cor=mean(cor, na.rm=TRUE),
avg_MSE=mean(MSE))
tt
}
per_mg_fit_test_train <- met_per_mg_summary_lambda %>%
select(alpha, lambda.min.mean)
per_mg_fit_test_train <- met_per_mg_multiCV %>%
filter(run==1) %>%
select(alpha, fit) %>%
right_join(per_mg_fit_test_train)
## Joining, by = "alpha"
per_mg_fit_test_train <- per_mg_fit_test_train %>%
mutate(pred_full=map2(fit, lambda.min.mean, ~ predict(.x, s=.y, newx=met_per_mg.PCs)),
full_R=map_dbl(pred_full, ~ cor(.x, leaflength$leaf_avg_std)),
full_MSE=map_dbl(pred_full, ~ mean((leaflength$leaf_avg_std-.x)^2))) %>%
mutate(tt=map2(lambda.min.mean, alpha, ~ multi_tt(lambda=.x, alpha=.y, x=met_per_mg.PCs)))
## [1] 63.76959
## [1] 0
## [1] 3.23426
## [1] 0.1
## [1] 1.96389
## [1] 0.2
## [1] 1.390028
## [1] 0.3
## [1] 1.057486
## [1] 0.4
## [1] 0.8344099
## [1] 0.5
## [1] 0.7009496
## [1] 0.6
## [1] 0.5978767
## [1] 0.7
## [1] 0.527807
## [1] 0.8
## [1] 0.4681233
## [1] 0.9
## [1] 0.4171747
## [1] 1
(per_mg_fit_test_train <- per_mg_fit_test_train %>% unnest(tt))
per_mg_fit_test_train %>%
ggplot(aes(x=alpha)) +
geom_line(aes(y=avg_cor), color="red") +
geom_point(aes(y=avg_cor), color="red") +
geom_line(aes(y=avg_MSE), color="blue") +
geom_point(aes(y=avg_MSE), color="blue")
alpha of 0.8 to 1.0 are very similar and are the best here.
alpha_per_mg <- .8
best_per_mg <- per_mg_fit_test_train %>% filter(alpha == alpha_per_mg)
best_per_mg_fit <- best_per_mg$fit[[1]]
best_per_mg_lambda <- best_per_mg$lambda.min.mean
per_mg_coef.tb <- coef(best_per_mg_fit, s=best_per_mg_lambda) %>%
as.matrix() %>% as.data.frame() %>%
rownames_to_column(var="PC") %>%
rename(beta=`1`)
per_mg_coef.tb %>% filter(beta!=0) %>% arrange(beta)
pred and obs
plot(leaflength$leaf_avg_std, best_per_mg$pred_full[[1]])
cor.test(leaflength$leaf_avg_std, best_per_mg$pred_full[[1]]) #.57
##
## Pearson's product-moment correlation
##
## data: leaflength$leaf_avg_std and best_per_mg$pred_full[[1]]
## t = 3.7902, df = 22, p-value = 0.001005
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3015850 0.8231987
## sample estimates:
## cor
## 0.6285173
best_per_mg$full_MSE
## [1] 0.8478287
per_mg_vars <- per_mg_coef.tb %>%
filter(beta !=0, PC!="(Intercept)") %>%
pull(PC) %>% c("leaf_avg_std", .)
per_mg_relimp <- leaflength %>% select(leaf_avg_std) %>% cbind(met_per_mg.PCs) %>% as.data.frame() %>% dplyr::select(all_of(per_mg_vars)) %>%
calc.relimp()
per_mg_coef.tb <- per_mg_relimp@lmg %>% as.matrix() %>% as.data.frame() %>%
rownames_to_column("PC") %>%
rename(PropVar_met_per_mg=V1) %>%
full_join(per_mg_coef.tb) %>%
arrange(desc(PropVar_met_per_mg))
## Joining, by = "PC"
per_mg_coef.tb
Checkout the rotations.
met_per_mg_rotation_out <- met_per_mg.PC_rotation %>%
pivot_longer(-metabolite, names_to="PC", values_to="loading") %>%
filter(PC %in% filter(per_mg_coef.tb, beta!=0)$PC ) %>%
group_by(PC) %>%
filter(!str_detect(metabolite,".*(leaf|root)_[0-9]*$")) %>%
filter(abs(loading) >= 0.05) %>%
left_join(per_mg_coef.tb, by="PC") %>%
arrange(desc(abs(beta)), desc(abs(loading))) %>%
mutate(organ=ifelse(str_detect(metabolite, "_leaf_"), "leaf", "root"),
transformation="normalized",
metabolite=str_remove(metabolite, "met_per_mg_(root|leaf)_"),
metabolite_effect_on_leaf=ifelse(beta*loading>0, "increase", "decrease"))
met_per_mg_rotation_out %>% write_csv("../output/Leaf_associated_metabolites_normalized_NOBLANK.csv")
Fit 101 CVs for each of 11 alphas
set.seed(1245)
folds <- tibble(run=1:101) %>%
mutate(folds=map(run, ~ sample(rep(1:6,4))))
system.time (met_amt_multiCV <- expand_grid(run=1:100, alpha=round(seq(0,1,.1),1)) %>%
left_join(folds, by="run") %>%
mutate(fit=map2(folds, alpha, ~ cv.glmnet(x=met_amt.PCs, y=leaflength$leaf_avg_std, foldid = .x, alpha=.y
)))
#, lambda=exp(seq(-5,0,length.out = 50)) )))
) #100 seconds
## user system elapsed
## 68.531 2.282 81.667
head(met_amt_multiCV)
for each fit, pull out the mean cv error, lambda, min lambda, and 1se lambda
met_amt_multiCV <- met_amt_multiCV %>%
mutate(cvm=map(fit, magrittr::extract("cvm")),
lambda=map(fit, magrittr::extract("lambda")),
lambda.min=map_dbl(fit, magrittr::extract("lambda.min" )),
lambda.1se=map_dbl(fit, magrittr::extract("lambda.1se")),
nzero=map(fit, magrittr::extract("nzero"))
)
head(met_amt_multiCV)
now calculate the mean and sem of cvm and min,1se labmdas. These need to be done separately because of the way the grouping works
met_amt_summary_cvm <- met_amt_multiCV %>% dplyr::select(-fit, -folds) %>%
unnest(c(cvm, lambda)) %>%
group_by(alpha, lambda) %>%
summarize(meancvm=mean(cvm), sem=sd(cvm)/sqrt(n()), high=meancvm+sem, low=meancvm-sem)
## `summarise()` regrouping output by 'alpha' (override with `.groups` argument)
met_amt_summary_cvm
met_amt_summary_lambda <- met_amt_multiCV %>% dplyr::select(-fit, -folds, -cvm) %>%
group_by(alpha) %>%
summarize(
lambda.min.sd=sd(lambda.min),
lambda.min.mean=mean(lambda.min),
#lambda.min.med=median(lambda.min),
lambda.min.high=lambda.min.mean+lambda.min.sd,
#lambda.min.low=lambda.min.mean-lambda.min.sem,
#lambda.1se.sem=sd(lambda.1se)/sqrt(n()),
lambda.1se.mean=mean(lambda.1se),
#lambda.1se.med=median(lambda.1se),
#lambda.1se.high=lambda.1se+lambda.1se.sem,
#lambda.1se.low=lambda.1se-lambda.1se.sem,
nzero=nzero[1],
lambda=lambda[1]
)
## `summarise()` ungrouping output (override with `.groups` argument)
met_amt_summary_lambda
plot it
met_amt_summary_cvm %>%
#filter(alpha!=0) %>% # worse than everything else and throwing the plots off
ggplot(aes(x=log(lambda), y= meancvm, ymin=low, ymax=high)) +
geom_ribbon(alpha=.25) +
geom_line(aes(color=as.character(alpha))) +
facet_wrap(~ as.character(alpha)) +
coord_cartesian(xlim=(c(-5,0))) +
geom_vline(aes(xintercept=log(lambda.min.mean)), alpha=.5, data=met_amt_summary_lambda) +
geom_vline(aes(xintercept=log(lambda.min.high)), alpha=.5, data=met_amt_summary_lambda, color="blue")
Make a plot of MSE at minimum lambda for each alpha
met_amt_summary_cvm %>%
group_by(alpha) %>%
filter(rank(meancvm, ties.method = "first")==1) %>%
ggplot(aes(x=alpha,y=meancvm,ymin=low,ymax=high)) +
geom_ribbon(color=NA, fill="gray80") +
geom_line() +
geom_point()
not a particular large difference here after 0.2
Plot the number of nzero coefficients
met_amt_summary_lambda %>%
unnest(c(lambda, nzero)) %>%
group_by(alpha) %>%
filter(abs(lambda.min.mean-lambda)==min(abs(lambda.min.mean-lambda)) ) %>%
ungroup() %>%
ggplot(aes(x=as.character(alpha), y=nzero)) +
geom_point() +
ggtitle("Number of non-zero coefficents at minimum lambda") +
ylim(0,36)
## Warning: Removed 1 rows containing missing values (geom_point).
OK let’s do repeated test train starting from these CV lambdas
multi_tt <- function(lambda, alpha, n=10000, sample_size=24, train_size=20, x, y=leaflength$leaf_avg_std) {
print(lambda)
print(alpha)
tt <-
tibble(run=1:n) %>%
mutate(train=map(run, ~ sample(1:sample_size, train_size))) %>%
mutate(fit=map(train, ~ glmnet(x=x[.,], y=y[.], lambda = lambda, alpha = alpha ))) %>%
mutate(pred=map2(fit, train, ~ predict(.x, newx = x[-.y,]))) %>%
mutate(cor=map2_dbl(pred, train, ~ cor(.x, y[-.y]) )) %>%
mutate(MSE=map2_dbl(pred, train, ~ mean((y[-.y] - .x)^2))) %>%
summarize(
num_na=sum(is.na(cor)),
num_lt_0=sum(cor<=0, na.rm=TRUE),
avg_cor=mean(cor, na.rm=TRUE),
avg_MSE=mean(MSE))
tt
}
amt_fit_test_train <- met_amt_summary_lambda %>%
select(alpha, lambda.min.mean)
amt_fit_test_train <- met_amt_multiCV %>%
filter(run==1) %>%
select(alpha, fit) %>%
right_join(amt_fit_test_train)
## Joining, by = "alpha"
amt_fit_test_train <- amt_fit_test_train %>%
mutate(pred_full=map2(fit, lambda.min.mean, ~ predict(.x, s=.y, newx=met_amt.PCs)),
full_R=map_dbl(pred_full, ~ cor(.x, leaflength$leaf_avg_std)),
full_MSE=map_dbl(pred_full, ~ mean((leaflength$leaf_avg_std-.x)^2))) %>%
mutate(tt=map2(lambda.min.mean, alpha, ~ multi_tt(lambda=.x, alpha=.y, x=met_amt.PCs)))
## [1] 129.3889
## [1] 0
## [1] 2.394178
## [1] 0.1
## [1] 1.622926
## [1] 0.2
## [1] 1.129161
## [1] 0.3
## [1] 0.8782994
## [1] 0.4
## [1] 0.709034
## [1] 0.5
## [1] 0.5979773
## [1] 0.6
## [1] 0.5183774
## [1] 0.7
## [1] 0.4567253
## [1] 0.8
## [1] 0.4084716
## [1] 0.9
## [1] 0.3686524
## [1] 1
(amt_fit_test_train <- amt_fit_test_train %>% unnest(tt))
amt_fit_test_train %>%
ggplot(aes(x=alpha)) +
geom_line(aes(y=avg_cor), color="red") +
geom_point(aes(y=avg_cor), color="red") +
geom_line(aes(y=avg_MSE), color="blue") +
geom_point(aes(y=avg_MSE), color="blue")
alpha of 0.8 to 1.0 are very similar and are the best here.
alpha_amt <- .8
best_amt <- amt_fit_test_train %>% filter(alpha == alpha_amt)
best_amt_fit <- best_amt$fit[[1]]
best_amt_lambda <- best_amt$lambda.min.mean
amt_coef.tb <- coef(best_amt_fit, s=best_amt_lambda) %>%
as.matrix() %>% as.data.frame() %>%
rownames_to_column(var="PC") %>%
rename(beta=`1`)
amt_coef.tb %>% filter(beta!=0) %>% arrange(beta)
pred and obs
plot(leaflength$leaf_avg_std, best_amt$pred_full[[1]])
cor.test(leaflength$leaf_avg_std, best_amt$pred_full[[1]]) #.64
##
## Pearson's product-moment correlation
##
## data: leaflength$leaf_avg_std and best_amt$pred_full[[1]]
## t = 3.8913, df = 22, p-value = 0.0007859
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3166703 0.8285020
## sample estimates:
## cor
## 0.6385023
best_amt$full_MSE
## [1] 0.6742621
amt_vars <- amt_coef.tb %>%
filter(beta !=0, PC!="(Intercept)") %>%
pull(PC) %>% c("leaf_avg_std", .)
# because there is only one explanatory variable, we don't need to (and can't) use relimp
amt_relimp <- leaflength %>% select(leaf_avg_std) %>% cbind(met_amt.PCs) %>% as.data.frame() %>% dplyr::select(all_of(amt_vars)) %>% cor() %>% magrittr::extract(1,2) %>% magrittr::multiply_by(.,.)
amt_relimp <- tibble(PC=str_subset(amt_vars, "PC"),
PropVar_met_amt=amt_relimp)
amt_coef.tb <- amt_relimp %>%
full_join(amt_coef.tb) %>%
arrange(desc(PropVar_met_amt))
## Joining, by = "PC"
amt_coef.tb
Checkout the rotations.
met_amt_rotation_out <- met_amt.PC_rotation %>%
pivot_longer(-metabolite, names_to="PC", values_to="loading") %>%
filter(PC %in% filter(amt_coef.tb, beta!=0)$PC ) %>%
group_by(PC) %>%
filter(!str_detect(metabolite,".*(leaf|root)_[0-9]*$")) %>%
filter(abs(loading) >= 0.05) %>%
left_join(amt_coef.tb, by="PC") %>%
arrange(desc(abs(beta)), desc(abs(loading))) %>%
mutate(organ=ifelse(str_detect(metabolite, "_leaf_"), "leaf", "root"),
transformation="raw",
metabolite=str_remove(metabolite, "met_amt_(root|leaf)_"),
metabolite_effect_on_leaf=ifelse(beta*loading>0, "increase", "decrease"))
met_amt_rotation_out %>% write_csv("../output/Leaf_associated_metabolites_raw_NOBLANK.csv")
met_amt_rotation_out